{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "7980c1ca",
   "metadata": {},
   "source": [
    "# Quickstart"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "8daaca79",
   "metadata": {},
   "outputs": [],
   "source": [
    "import shutil\n",
    "\n",
    "import lance\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import pyarrow as pa"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e6a08856",
   "metadata": {},
   "source": [
    "## Creating datasets\n",
    "\n",
    "Via pyarrow it's really easy to create lance datasets"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1af35939",
   "metadata": {},
   "source": [
    "Create a dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "e595a4dc",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   a\n",
       "0  5"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# One-row, one-column frame used as the seed data for the examples that follow.\n",
    "df = pd.DataFrame({\"a\": [5]})\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "aaba1ba2",
   "metadata": {},
   "source": [
    "Write it to lance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "0f81f858",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   a\n",
       "0  5"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Remove any dataset left over from a previous run so the cell can be re-run;\n",
    "# ignore_errors=True keeps this quiet when the path does not exist yet.\n",
    "shutil.rmtree(\"/tmp/test.lance\", ignore_errors=True)\n",
    "\n",
    "# write_dataset returns a dataset handle, which is read back here to show its contents.\n",
    "dataset = lance.write_dataset(df, \"/tmp/test.lance\")\n",
    "dataset.to_table().to_pandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c70bc8b7",
   "metadata": {},
   "source": [
    "### Converting from parquet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "6b059ca3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   a\n",
       "0  5"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Start from a clean slate: drop both the parquet and the lance output of earlier runs.\n",
    "for stale_path in (\"/tmp/test.parquet\", \"/tmp/test.lance\"):\n",
    "    shutil.rmtree(stale_path, ignore_errors=True)\n",
    "\n",
    "# Persist the dataframe as a parquet dataset on disk ...\n",
    "tbl = pa.Table.from_pandas(df)\n",
    "pa.dataset.write_dataset(tbl, \"/tmp/test.parquet\", format=\"parquet\")\n",
    "\n",
    "# ... and open it again as a pyarrow dataset to display what was written.\n",
    "parquet = pa.dataset.dataset(\"/tmp/test.parquet\")\n",
    "parquet.to_table().to_pandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "796bd0ed",
   "metadata": {},
   "source": [
    "Write to lance in 1 line"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "49f3267e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# `parquet` is the pyarrow dataset opened in the previous cell; it is handed to write_dataset as-is.\n",
    "dataset = lance.write_dataset(parquet, \"/tmp/test.lance\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "181dc238",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   a\n",
       "0  5"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read the Lance copy back: it should show the same single row as the parquet dataset.\n",
    "dataset.to_table().to_pandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dd3bb968",
   "metadata": {},
   "source": [
    "## Versioning"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "afe83b1a",
   "metadata": {},
   "source": [
    "We can append rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "8d35cb2f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    a\n",
       "0   5\n",
       "1  10"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Build a one-row table and add it to the existing dataset with mode=\"append\".\n",
    "# Note that `df` and `tbl` are rebound here, replacing the objects from earlier cells.\n",
    "df = pd.DataFrame({\"a\": [10]})\n",
    "tbl = pa.Table.from_pandas(df)\n",
    "dataset = lance.write_dataset(tbl, \"/tmp/test.lance\", mode=\"append\")\n",
    "\n",
    "# The dataset now holds the original row plus the appended one.\n",
    "dataset.to_table().to_pandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5fbe4d27",
   "metadata": {},
   "source": [
    "We can overwrite the data and create a new version"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "00a1d2ab",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Replace the dataset contents with two new rows using mode=\"overwrite\".\n",
    "df = pd.DataFrame({\"a\": [50, 100]})\n",
    "tbl = pa.Table.from_pandas(df)\n",
    "dataset = lance.write_dataset(tbl, \"/tmp/test.lance\", mode=\"overwrite\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "999f9514",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>50</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     a\n",
       "0   50\n",
       "1  100"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# The handle returned by the overwrite shows only the newly written rows.\n",
    "dataset.to_table().to_pandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1780e384",
   "metadata": {},
   "source": [
    "The old version is still there"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "717e1d0e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'version': 1,\n",
       "  'timestamp': datetime.datetime(2024, 8, 15, 21, 22, 31, 453453),\n",
       "  'metadata': {}},\n",
       " {'version': 2,\n",
       "  'timestamp': datetime.datetime(2024, 8, 15, 21, 22, 35, 475152),\n",
       "  'metadata': {}},\n",
       " {'version': 3,\n",
       "  'timestamp': datetime.datetime(2024, 8, 15, 21, 22, 45, 32922),\n",
       "  'metadata': {}}]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# List every recorded version of the dataset (version number, timestamp, metadata).\n",
    "dataset.versions()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "be1a982f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   a\n",
       "0  5"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Open the dataset pinned to version 1: the state right after the initial write.\n",
    "lance.dataset('/tmp/test.lance', version=1).to_table().to_pandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "4ee907e0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    a\n",
       "0   5\n",
       "1  10"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Open the dataset pinned to version 2: the state after the append.\n",
    "lance.dataset('/tmp/test.lance', version=2).to_table().to_pandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2065df4b",
   "metadata": {},
   "source": [
    "We can create tags"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "bdeae709",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'nightly': {'version': 3, 'manifest_size': 628},\n",
       " 'stable': {'version': 2, 'manifest_size': 684}}"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Give names to two existing versions: \"stable\" -> version 2, \"nightly\" -> version 3.\n",
    "dataset.tags.create(\"stable\", 2)\n",
    "dataset.tags.create(\"nightly\", 3)\n",
    "# Show all tags together with the version each one points at.\n",
    "dataset.tags.list()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d60b9c04",
   "metadata": {},
   "source": [
    "which can be checked out"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "1852c5b3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    a\n",
       "0   5\n",
       "1  10"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# A tag name can be passed as `version`; \"stable\" was created above for version 2.\n",
    "lance.dataset('/tmp/test.lance', version=\"stable\").to_table().to_pandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fd02c885",
   "metadata": {},
   "source": [
    "## Vectors"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b0e2ad5a",
   "metadata": {},
   "source": [
    "### Data preparation"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "881aba7e",
   "metadata": {},
   "source": [
    "For this tutorial let's use the Sift 1M dataset:\n",
    "\n",
    "- Download `ANN_SIFT1M` from: http://corpus-texmex.irisa.fr/\n",
    "- Direct link should be `ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz`\n",
    "- Download and then unzip the tarball"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "2ac159fa",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--2023-02-13 16:54:50--  ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz\n",
      "           => ‘sift.tar.gz’\n",
      "Resolving ftp.irisa.fr (ftp.irisa.fr)... 131.254.254.45\n",
      "Connecting to ftp.irisa.fr (ftp.irisa.fr)|131.254.254.45|:21... connected.\n",
      "Logging in as anonymous ... Logged in!\n",
      "==> SYST ... done.    ==> PWD ... done.\n",
      "==> TYPE I ... done.  ==> CWD (1) /local/texmex/corpus ... done.\n",
      "==> SIZE sift.tar.gz ... 168280445\n",
      "==> PASV ... done.    ==> RETR sift.tar.gz ... done.\n",
      "Length: 168280445 (160M) (unauthoritative)\n",
      "\n",
      "sift.tar.gz         100%[===================>] 160.48M  6.85MB/s    in 36s     \n",
      "\n",
      "2023-02-13 16:55:29 (4.43 MB/s) - ‘sift.tar.gz’ saved [168280445]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Start clean: remove any earlier download/extraction and the derived Lance dataset,\n",
    "# then fetch the SIFT1M tarball and unpack it into ./sift/.\n",
    "!rm -rf sift* vec_data.lance\n",
    "!wget ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz\n",
    "!tar -xzf sift.tar.gz"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5786faee",
   "metadata": {},
   "source": [
    "Convert it to Lance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "63993672",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<lance.dataset.LanceDataset at 0x13859fe20>"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from lance.vector import vec_to_table\n",
    "import struct\n",
    "\n",
    "uri = \"vec_data.lance\"\n",
    "\n",
    "# .fvecs layout: every record is a little-endian int32 dimension header\n",
    "# followed by that many little-endian float32 components.\n",
    "with open(\"sift/sift_base.fvecs\", mode=\"rb\") as fobj:\n",
    "    buf = fobj.read()\n",
    "\n",
    "# Take the dimension from the first record's header.\n",
    "(ndim,) = struct.unpack(\"<i\", buf[:4])\n",
    "# View the file as rows of (1 header + ndim values) 4-byte slots and drop the\n",
    "# header column. Reading the payload as one contiguous run of floats would turn\n",
    "# every later header into a bogus component (128 reinterpreted as float32 is\n",
    "# ~1.79e-43) and shift the rows out of alignment.\n",
    "data = np.frombuffer(buf, dtype=\"<f4\").reshape(-1, ndim + 1)[:, 1:]\n",
    "# Keys become the \"id\" column, values the \"vector\" column.\n",
    "dd = dict(zip(range(len(data)), data))\n",
    "\n",
    "table = vec_to_table(dd)\n",
    "lance.write_dataset(table, uri, max_rows_per_group=8192, max_rows_per_file=1024*1024)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "6b164493",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Open the Lance dataset written above; `sift1m` is the handle the queries below use.\n",
    "uri = \"vec_data.lance\"\n",
    "sift1m = lance.dataset(uri)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d9485bc3",
   "metadata": {},
   "source": [
    "### KNN (no index)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5d79e3ca",
   "metadata": {},
   "source": [
    "Sample 100 vectors as query vectors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "0c7132b8",
   "metadata": {
    "pycharm": {
     "is_executing": true
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0     [29.0, 10.0, 1.0, 50.0, 7.0, 89.0, 95.0, 51.0,...\n",
       "1     [7.0, 5.0, 39.0, 49.0, 17.0, 12.0, 83.0, 117.0...\n",
       "2     [0.0, 0.0, 0.0, 10.0, 12.0, 31.0, 6.0, 0.0, 0....\n",
       "3     [0.0, 2.0, 9.0, 1.793662034335766e-43, 30.0, 1...\n",
       "4     [54.0, 112.0, 16.0, 0.0, 0.0, 7.0, 112.0, 44.0...\n",
       "                            ...                        \n",
       "95    [1.793662034335766e-43, 33.0, 47.0, 28.0, 0.0,...\n",
       "96    [1.0, 4.0, 2.0, 32.0, 3.0, 7.0, 119.0, 116.0, ...\n",
       "97    [17.0, 46.0, 12.0, 0.0, 0.0, 3.0, 23.0, 58.0, ...\n",
       "98    [0.0, 11.0, 30.0, 14.0, 34.0, 7.0, 0.0, 0.0, 1...\n",
       "99    [20.0, 8.0, 121.0, 98.0, 37.0, 77.0, 9.0, 18.0...\n",
       "Name: vector, Length: 100, dtype: object"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import duckdb\n",
    "# if this segfaults make sure duckdb v0.7+ is installed\n",
    "# The SQL refers to the `sift1m` dataset variable by name; USING SAMPLE 100 draws\n",
    "# 100 rows, and only their `vector` column is kept, as a pandas Series.\n",
    "samples = duckdb.query(\"SELECT vector FROM sift1m USING SAMPLE 100\").to_df().vector\n",
    "samples"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1d07fb90",
   "metadata": {},
   "source": [
    "Call nearest neighbors (no ANN index here)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "879875b8",
   "metadata": {
    "pycharm": {
     "is_executing": true
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Time(sec): 0.10735273361206055\n",
      "       id                                             vector    score\n",
      "0  144678  [29.0, 10.0, 1.0, 50.0, 7.0, 89.0, 95.0, 51.0,...      0.0\n",
      "1  575538  [2.0, 0.0, 1.0, 42.0, 3.0, 38.0, 152.0, 27.0, ...  76908.0\n",
      "2  241428  [11.0, 0.0, 2.0, 118.0, 11.0, 108.0, 116.0, 21...  92877.0\n",
      "3  220788  [0.0, 0.0, 0.0, 95.0, 0.0, 8.0, 133.0, 67.0, 1...  93305.0\n",
      "4  833796  [1.0, 1.0, 0.0, 23.0, 11.0, 26.0, 140.0, 115.0...  95721.0\n",
      "5  919065  [1.0, 1.0, 1.0, 42.0, 96.0, 42.0, 126.0, 83.0,...  96632.0\n",
      "6  741948  [36.0, 9.0, 15.0, 108.0, 17.0, 23.0, 25.0, 55....  96927.0\n",
      "7  225303  [0.0, 0.0, 3.0, 41.0, 0.0, 2.0, 36.0, 84.0, 68...  97055.0\n",
      "8  787098  [4.0, 5.0, 7.0, 29.0, 7.0, 1.0, 9.0, 91.0, 33....  97950.0\n",
      "9  113073  [0.0, 0.0, 0.0, 64.0, 65.0, 30.0, 12.0, 33.0, ...  99572.0\n"
     ]
    }
   ],
   "source": [
    "import time\n",
    "\n",
    "# perf_counter() is a monotonic, high-resolution clock meant for timing short\n",
    "# intervals; time.time() follows the wall clock and can jump if it is adjusted.\n",
    "start = time.perf_counter()\n",
    "# Exact nearest-neighbour search for the first sampled vector (k=10 results).\n",
    "tbl = sift1m.to_table(columns=[\"id\"], nearest={\"column\": \"vector\", \"q\": samples[0], \"k\": 10})\n",
    "end = time.perf_counter()\n",
    "\n",
    "print(f\"Time(sec): {end-start}\")\n",
    "print(tbl.to_pandas())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "31c79385",
   "metadata": {},
   "source": [
    "Without the index this is scanning through the whole dataset to compute the distance. <br/>\n",
    "\n",
    "For real-time serving we can do much better with an ANN index"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "10bfa213",
   "metadata": {},
   "source": [
    "### Build index"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "256f717a",
   "metadata": {},
   "source": [
    "Now let's build an index. Lance now supports IVF_PQ, IVF_HNSW_PQ and IVF_HNSW_SQ indexes"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3832459d",
   "metadata": {},
   "source": [
    "**NOTE** If you'd rather not wait for index build, you can download a version with the index pre-built from [here](https://eto-public.s3.us-west-2.amazonaws.com/datasets/sift/sift_ivf256_pq16.tar.gz) and skip the next cell"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "c99fdd57",
   "metadata": {
    "pycharm": {
     "is_executing": true
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Building vector index: IVF256,PQ16\n",
      "CPU times: user 2min 23s, sys: 2.77 s, total: 2min 26s\n",
      "Wall time: 22.7 s\n",
      "Sample 65536 out of 1000000 to train kmeans of 128 dim, 256 clusters\n",
      "Sample 65536 out of 1000000 to train kmeans of 8 dim, 256 clusters\n",
      "Sample 65536 out of 1000000 to train kmeans of 8 dim, 256 clusters\n",
      "Sample 65536 out of 1000000 to train kmeans of 8 dim, 256 clusters\n",
      "Sample 65536 out of 1000000 to train kmeans of 8 dim, 256 clusters\n",
      "Sample 65536 out of 1000000 to train kmeans of 8 dim, 256 clusters\n",
      "Sample 65536 out of 1000000 to train kmeans of 8 dim, 256 clusters\n",
      "Sample 65536 out of 1000000 to train kmeans of 8 dim, 256 clusters\n",
      "Sample 65536 out of 1000000 to train kmeans of 8 dim, 256 clusters\n",
      "Sample 65536 out of 1000000 to train kmeans of 8 dim, 256 clusters\n",
      "Sample 65536 out of 1000000 to train kmeans of 8 dim, 256 clusters\n",
      "Sample 65536 out of 1000000 to train kmeans of 8 dim, 256 clusters\n",
      "Sample 65536 out of 1000000 to train kmeans of 8 dim, 256 clusters\n",
      "Sample 65536 out of 1000000 to train kmeans of 8 dim, 256 clusters\n",
      "Sample 65536 out of 1000000 to train kmeans of 8 dim, 256 clusters\n",
      "Sample 65536 out of 1000000 to train kmeans of 8 dim, 256 clusters\n",
      "Sample 65536 out of 1000000 to train kmeans of 8 dim, 256 clusters\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# Build an IVF_PQ index on the \"vector\" column; %%time reports how long the build takes.\n",
    "sift1m.create_index(\n",
    "    \"vector\",\n",
    "    index_type=\"IVF_PQ\", # IVF_PQ, IVF_HNSW_PQ and IVF_HNSW_SQ are supported\n",
    "    num_partitions=256,  # IVF\n",
    "    num_sub_vectors=16,  # PQ\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cbfb0750",
   "metadata": {},
   "source": [
    "**NOTE** If you're trying this on your own data, make sure that (vector dimension / num_sub_vectors) % 8 == 0, or else index creation will take much longer than expected due to SIMD misalignment"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e241cc5c",
   "metadata": {},
   "source": [
    "### Try nearest neighbors again with ANN index\n",
    "\n",
    "Let's look for nearest neighbors again"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "5e99289f",
   "metadata": {
    "pycharm": {
     "is_executing": true
    }
   },
   "outputs": [],
   "source": [
    "# Open a fresh handle on the dataset now that create_index has run.\n",
    "# NOTE(review): presumably needed so queries see the new index - confirm.\n",
    "sift1m = lance.dataset(uri)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "166c7f7f",
   "metadata": {
    "pycharm": {
     "is_executing": true
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Avg(sec): 0.0009334301948547364\n",
      "       id                                             vector         score\n",
      "0  378825  [20.0, 8.0, 121.0, 98.0, 37.0, 77.0, 9.0, 18.0...  16560.197266\n",
      "1  143787  [11.0, 24.0, 122.0, 122.0, 53.0, 4.0, 0.0, 3.0...  61714.941406\n",
      "2  356895  [0.0, 14.0, 67.0, 122.0, 83.0, 23.0, 1.0, 0.0,...  64147.218750\n",
      "3  535431  [9.0, 22.0, 118.0, 118.0, 4.0, 5.0, 4.0, 4.0, ...  69092.593750\n",
      "4  308778  [1.0, 7.0, 48.0, 123.0, 73.0, 36.0, 8.0, 4.0, ...  69131.812500\n",
      "5  222477  [14.0, 73.0, 39.0, 4.0, 16.0, 94.0, 19.0, 8.0,...  69244.195312\n",
      "6  672558  [2.0, 1.0, 0.0, 11.0, 36.0, 23.0, 7.0, 10.0, 0...  70264.828125\n",
      "7  365538  [54.0, 43.0, 97.0, 59.0, 34.0, 17.0, 10.0, 15....  70273.710938\n",
      "8  659787  [10.0, 9.0, 23.0, 121.0, 38.0, 26.0, 38.0, 9.0...  70374.703125\n",
      "9  603930  [32.0, 32.0, 122.0, 122.0, 70.0, 4.0, 15.0, 12...  70583.375000\n"
     ]
    }
   ],
   "source": [
    "import time\n",
    "\n",
    "# Average the query latency over every sampled vector. perf_counter() is the\n",
    "# monotonic, high-resolution clock intended for measuring short intervals;\n",
    "# time.time() follows the wall clock and can jump if it is adjusted.\n",
    "tot = 0\n",
    "for q in samples:\n",
    "    start = time.perf_counter()\n",
    "    tbl = sift1m.to_table(nearest={\"column\": \"vector\", \"q\": q, \"k\": 10})\n",
    "    end = time.perf_counter()\n",
    "    tot += (end - start)\n",
    "\n",
    "print(f\"Avg(sec): {tot / len(samples)}\")\n",
    "# Only the result of the last query in the loop is shown.\n",
    "print(tbl.to_pandas())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dbe69490",
   "metadata": {},
   "source": [
    "**NOTE** on performance: your actual numbers will vary with your storage. These numbers were measured on the local disk of an M2 MacBook Air. If you're querying S3 directly, an HDD, or network drives, performance will be slower."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ff28eb34",
   "metadata": {},
   "source": [
    "The latency vs recall is tunable via:\n",
    "- nprobes: how many IVF partitions to search\n",
    "- refine_factor: determines how many vectors are retrieved during re-ranking"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "d26e2b27",
   "metadata": {
    "pycharm": {
     "is_executing": true
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 2.53 ms, sys: 3.31 ms, total: 5.84 ms\n",
      "Wall time: 4.18 ms\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>vector</th>\n",
       "      <th>score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>144678</td>\n",
       "      <td>[29.0, 10.0, 1.0, 50.0, 7.0, 89.0, 95.0, 51.0,...</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>575538</td>\n",
       "      <td>[2.0, 0.0, 1.0, 42.0, 3.0, 38.0, 152.0, 27.0, ...</td>\n",
       "      <td>76908.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>241428</td>\n",
       "      <td>[11.0, 0.0, 2.0, 118.0, 11.0, 108.0, 116.0, 21...</td>\n",
       "      <td>92877.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>220788</td>\n",
       "      <td>[0.0, 0.0, 0.0, 95.0, 0.0, 8.0, 133.0, 67.0, 1...</td>\n",
       "      <td>93305.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>833796</td>\n",
       "      <td>[1.0, 1.0, 0.0, 23.0, 11.0, 26.0, 140.0, 115.0...</td>\n",
       "      <td>95721.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>919065</td>\n",
       "      <td>[1.0, 1.0, 1.0, 42.0, 96.0, 42.0, 126.0, 83.0,...</td>\n",
       "      <td>96632.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>741948</td>\n",
       "      <td>[36.0, 9.0, 15.0, 108.0, 17.0, 23.0, 25.0, 55....</td>\n",
       "      <td>96927.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>225303</td>\n",
       "      <td>[0.0, 0.0, 3.0, 41.0, 0.0, 2.0, 36.0, 84.0, 68...</td>\n",
       "      <td>97055.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>787098</td>\n",
       "      <td>[4.0, 5.0, 7.0, 29.0, 7.0, 1.0, 9.0, 91.0, 33....</td>\n",
       "      <td>97950.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>113073</td>\n",
       "      <td>[0.0, 0.0, 0.0, 64.0, 65.0, 30.0, 12.0, 33.0, ...</td>\n",
       "      <td>99572.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       id                                             vector    score\n",
       "0  144678  [29.0, 10.0, 1.0, 50.0, 7.0, 89.0, 95.0, 51.0,...      0.0\n",
       "1  575538  [2.0, 0.0, 1.0, 42.0, 3.0, 38.0, 152.0, 27.0, ...  76908.0\n",
       "2  241428  [11.0, 0.0, 2.0, 118.0, 11.0, 108.0, 116.0, 21...  92877.0\n",
       "3  220788  [0.0, 0.0, 0.0, 95.0, 0.0, 8.0, 133.0, 67.0, 1...  93305.0\n",
       "4  833796  [1.0, 1.0, 0.0, 23.0, 11.0, 26.0, 140.0, 115.0...  95721.0\n",
       "5  919065  [1.0, 1.0, 1.0, 42.0, 96.0, 42.0, 126.0, 83.0,...  96632.0\n",
       "6  741948  [36.0, 9.0, 15.0, 108.0, 17.0, 23.0, 25.0, 55....  96927.0\n",
       "7  225303  [0.0, 0.0, 3.0, 41.0, 0.0, 2.0, 36.0, 84.0, 68...  97055.0\n",
       "8  787098  [4.0, 5.0, 7.0, 29.0, 7.0, 1.0, 9.0, 91.0, 33....  97950.0\n",
       "9  113073  [0.0, 0.0, 0.0, 64.0, 65.0, 30.0, 12.0, 33.0, ...  99572.0"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "sift1m.to_table(\n",
    "    nearest={\n",
    "        \"column\": \"vector\",\n",
    "        \"q\": samples[0],\n",
    "        \"k\": 10,\n",
    "        \"nprobes\": 10,\n",
    "        \"refine_factor\": 5,\n",
    "    }\n",
    ").to_pandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7f34575d",
   "metadata": {},
   "source": [
    "q => sample vector\n",
    "\n",
    "k => how many neighbors to return\n",
    "\n",
    "nprobes => how many partitions (in the coarse quantizer) to probe\n",
    "\n",
    "refine_factor => controls \"re-ranking\". If k=10 and refine_factor=5 then retrieve 50 nearest neighbors by ANN and re-sort using actual distances then return top 10. This improves recall without sacrificing performance too much"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "25b9d50f",
   "metadata": {},
   "source": [
    "**NOTE** the latencies above include file io as lance currently doesn't hold anything in memory. Along with index building speed, creating a purely in memory version of the dataset would make the biggest impact on performance."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7f94fcec",
   "metadata": {},
   "source": [
    "### Features and vector can be retrieved together\n",
    "\n",
    "Usually we have other feature or metadata columns that need to be stored and fetched together.\n",
    "If you're managing data and the index separately, you have to do a bunch of annoying plumbing to put stuff together. With Lance it's a single call"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "846ea382",
   "metadata": {
    "pycharm": {
     "is_executing": true
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>vector</th>\n",
       "      <th>item_id</th>\n",
       "      <th>revenue</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>[0.0, 16.0, 35.0, 5.0, 32.0, 31.0, 14.0, 10.0,...</td>\n",
       "      <td>0</td>\n",
       "      <td>5950.436925</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>[1.8e-43, 14.0, 35.0, 19.0, 20.0, 3.0, 1.0, 13...</td>\n",
       "      <td>1</td>\n",
       "      <td>4680.298627</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>[33.0, 1.8e-43, 0.0, 1.0, 5.0, 3.0, 44.0, 40.0...</td>\n",
       "      <td>2</td>\n",
       "      <td>5342.593212</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>[23.0, 10.0, 1.8e-43, 12.0, 47.0, 14.0, 25.0, ...</td>\n",
       "      <td>3</td>\n",
       "      <td>5080.994002</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>[27.0, 29.0, 21.0, 1.8e-43, 1.0, 1.0, 0.0, 0.0...</td>\n",
       "      <td>4</td>\n",
       "      <td>4977.299308</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>999995</th>\n",
       "      <td>999995</td>\n",
       "      <td>[8.0, 9.0, 5.0, 0.0, 10.0, 39.0, 72.0, 68.0, 3...</td>\n",
       "      <td>999995</td>\n",
       "      <td>4928.768010</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>999996</th>\n",
       "      <td>999996</td>\n",
       "      <td>[3.0, 28.0, 55.0, 29.0, 35.0, 12.0, 1.0, 2.0, ...</td>\n",
       "      <td>999996</td>\n",
       "      <td>5056.264199</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>999997</th>\n",
       "      <td>999997</td>\n",
       "      <td>[0.0, 13.0, 41.0, 72.0, 40.0, 9.0, 0.0, 0.0, 0...</td>\n",
       "      <td>999997</td>\n",
       "      <td>5930.547635</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>999998</th>\n",
       "      <td>999998</td>\n",
       "      <td>[41.0, 121.0, 4.0, 0.0, 0.0, 0.0, 0.0, 0.0, 24...</td>\n",
       "      <td>999998</td>\n",
       "      <td>5985.139759</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>999999</th>\n",
       "      <td>999999</td>\n",
       "      <td>[2.0, 4.0, 8.0, 8.0, 26.0, 72.0, 63.0, 0.0, 0....</td>\n",
       "      <td>999999</td>\n",
       "      <td>5008.962686</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1000000 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "            id                                             vector  item_id  \\\n",
       "0            0  [0.0, 16.0, 35.0, 5.0, 32.0, 31.0, 14.0, 10.0,...        0   \n",
       "1            1  [1.8e-43, 14.0, 35.0, 19.0, 20.0, 3.0, 1.0, 13...        1   \n",
       "2            2  [33.0, 1.8e-43, 0.0, 1.0, 5.0, 3.0, 44.0, 40.0...        2   \n",
       "3            3  [23.0, 10.0, 1.8e-43, 12.0, 47.0, 14.0, 25.0, ...        3   \n",
       "4            4  [27.0, 29.0, 21.0, 1.8e-43, 1.0, 1.0, 0.0, 0.0...        4   \n",
       "...        ...                                                ...      ...   \n",
       "999995  999995  [8.0, 9.0, 5.0, 0.0, 10.0, 39.0, 72.0, 68.0, 3...   999995   \n",
       "999996  999996  [3.0, 28.0, 55.0, 29.0, 35.0, 12.0, 1.0, 2.0, ...   999996   \n",
       "999997  999997  [0.0, 13.0, 41.0, 72.0, 40.0, 9.0, 0.0, 0.0, 0...   999997   \n",
       "999998  999998  [41.0, 121.0, 4.0, 0.0, 0.0, 0.0, 0.0, 0.0, 24...   999998   \n",
       "999999  999999  [2.0, 4.0, 8.0, 8.0, 26.0, 72.0, 63.0, 0.0, 0....   999999   \n",
       "\n",
       "            revenue  \n",
       "0       5950.436925  \n",
       "1       4680.298627  \n",
       "2       5342.593212  \n",
       "3       5080.994002  \n",
       "4       4977.299308  \n",
       "...             ...  \n",
       "999995  4928.768010  \n",
       "999996  5056.264199  \n",
       "999997  5930.547635  \n",
       "999998  5985.139759  \n",
       "999999  5008.962686  \n",
       "\n",
       "[1000000 rows x 4 columns]"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tbl = sift1m.to_table()\n",
    "tbl = tbl.append_column(\"item_id\", pa.array(range(len(tbl))))\n",
    "tbl = tbl.append_column(\"revenue\", pa.array((np.random.randn(len(tbl))+5)*1000))\n",
    "tbl.to_pandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "2a1fefe2",
   "metadata": {
    "pycharm": {
     "is_executing": true
    }
   },
   "outputs": [],
   "source": [
    "sift1m = lance.write_dataset(tbl, uri, mode=\"overwrite\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "4bca1a31",
   "metadata": {
    "pycharm": {
     "is_executing": true
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>revenue</th>\n",
       "      <th>vector</th>\n",
       "      <th>score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2994.968781</td>\n",
       "      <td>[29.0, 10.0, 1.0, 50.0, 7.0, 89.0, 95.0, 51.0,...</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4231.026305</td>\n",
       "      <td>[2.0, 0.0, 1.0, 42.0, 3.0, 38.0, 152.0, 27.0, ...</td>\n",
       "      <td>76908.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3340.900287</td>\n",
       "      <td>[11.0, 0.0, 2.0, 118.0, 11.0, 108.0, 116.0, 21...</td>\n",
       "      <td>92877.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4339.588996</td>\n",
       "      <td>[0.0, 0.0, 0.0, 95.0, 0.0, 8.0, 133.0, 67.0, 1...</td>\n",
       "      <td>93305.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5141.730799</td>\n",
       "      <td>[1.0, 1.0, 0.0, 23.0, 11.0, 26.0, 140.0, 115.0...</td>\n",
       "      <td>95721.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>4518.194820</td>\n",
       "      <td>[1.0, 1.0, 1.0, 42.0, 96.0, 42.0, 126.0, 83.0,...</td>\n",
       "      <td>96632.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>3383.586889</td>\n",
       "      <td>[36.0, 9.0, 15.0, 108.0, 17.0, 23.0, 25.0, 55....</td>\n",
       "      <td>96927.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>5496.905675</td>\n",
       "      <td>[0.0, 0.0, 3.0, 41.0, 0.0, 2.0, 36.0, 84.0, 68...</td>\n",
       "      <td>97055.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>5298.669719</td>\n",
       "      <td>[4.0, 5.0, 7.0, 29.0, 7.0, 1.0, 9.0, 91.0, 33....</td>\n",
       "      <td>97950.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>6742.810395</td>\n",
       "      <td>[0.0, 0.0, 0.0, 64.0, 65.0, 30.0, 12.0, 33.0, ...</td>\n",
       "      <td>99572.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       revenue                                             vector    score\n",
       "0  2994.968781  [29.0, 10.0, 1.0, 50.0, 7.0, 89.0, 95.0, 51.0,...      0.0\n",
       "1  4231.026305  [2.0, 0.0, 1.0, 42.0, 3.0, 38.0, 152.0, 27.0, ...  76908.0\n",
       "2  3340.900287  [11.0, 0.0, 2.0, 118.0, 11.0, 108.0, 116.0, 21...  92877.0\n",
       "3  4339.588996  [0.0, 0.0, 0.0, 95.0, 0.0, 8.0, 133.0, 67.0, 1...  93305.0\n",
       "4  5141.730799  [1.0, 1.0, 0.0, 23.0, 11.0, 26.0, 140.0, 115.0...  95721.0\n",
       "5  4518.194820  [1.0, 1.0, 1.0, 42.0, 96.0, 42.0, 126.0, 83.0,...  96632.0\n",
       "6  3383.586889  [36.0, 9.0, 15.0, 108.0, 17.0, 23.0, 25.0, 55....  96927.0\n",
       "7  5496.905675  [0.0, 0.0, 3.0, 41.0, 0.0, 2.0, 36.0, 84.0, 68...  97055.0\n",
       "8  5298.669719  [4.0, 5.0, 7.0, 29.0, 7.0, 1.0, 9.0, 91.0, 33....  97950.0\n",
       "9  6742.810395  [0.0, 0.0, 0.0, 64.0, 65.0, 30.0, 12.0, 33.0, ...  99572.0"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sift1m.to_table(columns=[\"revenue\"], nearest={\"column\": \"vector\", \"q\": samples[0], \"k\": 10}).to_pandas()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
